data <- read.table("~/Documents/BRAIN/gitrepo/zinb_analysis/sims/datasets/expression_mRNA_17-Aug-2014.txt", sep='\t', stringsAsFactors = FALSE, comment.char = '%')
level1 <- as.factor(as.matrix(data)[9,-(1:2)])
tissue <- as.factor(as.matrix(data)[1,-(1:2)])
group <- as.factor(as.matrix(data)[2,-(1:2)])
nmolecule <- as.numeric(as.matrix(data)[3,-(1:2)])
sex <- as.factor(as.matrix(data)[5,-(1:2)])
level2 <- as.factor(as.matrix(data)[10,-(1:2)])
counts <- as.matrix(data[12:NROW(data),-(1:2)])
counts <- matrix(as.numeric(counts), ncol=ncol(counts), nrow=nrow(counts))
rownames(counts) <- data[12:NROW(data),1]
colnames(counts) <- data[8, -(1:2)]
filter = sample(1:ncol(counts), 300, replace = F)
counts = counts[, filter]
group = droplevels(group[filter])
sex = sex[filter]
tissue = tissue[filter]
level1 = droplevels(level1[filter])
level2 = level2[filter]
filterGenes = apply(counts > 5, 1, sum) >= 5
counts <- counts[filterGenes, ]
#vars <- rowVars(log1p(counts))
#names(vars) <- rownames(counts)
#vars <- sort(vars, decreasing = TRUE)
#core <- counts[names(vars)[1:1000],]
core <- counts[sample(1:nrow(counts), 1000),]
sum(core == 0)/(ncol(core) * nrow(core))
## [1] 0.45793
col1 <- brewer.pal(9, "Set1")
col2 <- brewer.pal(8, "Set2")
colTissue <- col1[tissue]
colMerged <- col2[level1]
We use Zeisel dataset with all cells and only 1000 randomly selected genes. The dimensions are
dim(core)
## [1] 1000 300
table(tissue)
## tissue
## ca1hippocampus sscortex
## 136 164
table(tissue, group)
## group
## tissue 1 2 3 4 5 6 7 8 9
## ca1hippocampus 16 2 92 10 2 4 5 1 4
## sscortex 17 36 1 69 8 15 13 0 5
table(tissue, sex)
## sex
## tissue -1 0 1
## ca1hippocampus 84 1 51
## sscortex 62 0 102
table(level1)
## level1
## astrocytes_ependymal endothelial-mural interneurons
## 19 28 33
## microglia oligodendrocytes pyramidal CA1
## 10 79 94
## pyramidal SS
## 37
Colors in left panel correspond to ? . Colors in right panel correspond to ?.
par(mfrow=c(1, 2))
fq <- EDASeq::betweenLaneNormalization(core, which="full")
pcafq <- prcomp(t(log1p(fq)))
plot(pcafq$x, col=colMerged, pch=20, main="PCA of FQ log-counts, centered not scaled")
plot(pcafq$x, col=colTissue, pch=20, main="PCA of FQ log-counts, centered not scaled")
Fit data with K = 2, V and X intercepts in both Mu and Pi, commondispersion = FALSE, and no covariate.
print(system.time(zinb <- zinbFit(core, ncores = 3, K = 2,
commondispersion = FALSE)))
## user system elapsed
## 193.585 78.733 123.975
Check correlations between \(W\), \(\gamma_{mu}\), \(\gamma_{pi}\), detection rate, and coverage.
\(W_1\) is correlated with \(\gamma_{\mu}\) but not \(\gamma_{\pi}\).
\(\gamma_{\mu}\) and \(\gamma_{\pi}\) are correlated.
\(\gamma_{\mu}\) is correlated with coverage.
\(\gamma_{\pi}\) is correlated with detection rate.
In what follow, we assume that \(W\) and \(\gamma_{\pi}\) are independent. We simulate \(W\), \(\gamma_{\pi}\), and \(\gamma_{\mu}\) separetely.
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
par(mfrow = c(1,1))
plot(zinb@W, col = colMerged, pch = 19, xlab = 'W1', ylab = 'W2', main = 'W')
df <- data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 detection_rate coverage
## W1 1.00000000 -0.01421971 0.7699318 0.6473462
## W2 -0.01421971 1.00000000 -0.1120463 -0.1601473
## detection_rate 0.76993179 -0.11204633 1.0000000 0.9336435
## coverage 0.64734621 -0.16014731 0.9336435 1.0000000
df <- data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2],
gamma_mu = zinb@gamma_mu[1, ],
gamma_pi = zinb@gamma_pi[1, ])
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.00000000 -0.01421971 0.46683363 -0.6656167
## W2 -0.01421971 1.00000000 0.01184591 0.1712210
## gamma_mu 0.46683363 0.01184591 1.00000000 -0.6436672
## gamma_pi -0.66561673 0.17122101 -0.64366715 1.0000000
df <- data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2],
gamma_mu = zinb@gamma_mu[1, ],
gamma_pi = zinb@gamma_pi[1, ],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.00000000 -0.01421971 0.46683363 -0.6656167
## W2 -0.01421971 1.00000000 0.01184591 0.1712210
## gamma_mu 0.46683363 0.01184591 1.00000000 -0.6436672
## gamma_pi -0.66561673 0.17122101 -0.64366715 1.0000000
## detection_rate 0.76993179 -0.11204633 0.81177969 -0.9248605
## coverage 0.64734621 -0.16014731 0.92242738 -0.8189961
## detection_rate coverage
## W1 0.7699318 0.6473462
## W2 -0.1120463 -0.1601473
## gamma_mu 0.8117797 0.9224274
## gamma_pi -0.9248605 -0.8189961
## detection_rate 1.0000000 0.9336435
## coverage 0.9336435 1.0000000
Additionally, let’s check \(\alpha\) and \(\beta\).
df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
alpha_mu_2 = zinb@alpha_mu[2, ],
alpha_pi_1 = zinb@alpha_pi[1, ],
alpha_pi_2 = zinb@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_pi_1 alpha_pi_2
## alpha_mu_1 1.00000000 0.01469818 -0.4629298 -0.1623635
## alpha_mu_2 0.01469818 1.00000000 -0.2132317 -0.4163138
## alpha_pi_1 -0.46292977 -0.21323174 1.0000000 0.2318247
## alpha_pi_2 -0.16236350 -0.41631384 0.2318247 1.0000000
df <- data.frame(beta_mu = zinb@beta_mu[1, ],
beta_pi = zinb@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.0000000 -0.3725573
## beta_pi -0.3725573 1.0000000
Pick number of cells to simulate. Here
ncells = 100
ncells
## [1] 100
We simulate \(W\), \(\gamma_{\mu}\), and \(\gamma_{\pi}\) from three multivariate gaussians.
df <- data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2],
gamma_mu = zinb@gamma_mu[1, ],
gamma_pi = zinb@gamma_pi[1, ])
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.00000000 -0.01421971 0.46683363 -0.6656167
## W2 -0.01421971 1.00000000 0.01184591 0.1712210
## gamma_mu 0.46683363 0.01184591 1.00000000 -0.6436672
## gamma_pi -0.66561673 0.17122101 -0.64366715 1.0000000
# mclust
mclust = Mclust(df, G = 3)
pairs(mclust$data, col = mclust$classification)
# multivar gaussian
clust = sample(mclust$classification, ncells)
sim = lapply(clust, function(i){
mvrnorm(n = 1, mu = mclust$parameters$mean[, i],
Sigma = mclust$parameters$variance$sigma[,, i])
})
sim = do.call(rbind, sim)
bio = clust
pairs(sim, col = bio)
print(cor(sim, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.0000000 -0.12055206 0.50000600 -0.6895890
## W2 -0.1205521 1.00000000 -0.04418842 0.2045245
## gamma_mu 0.5000060 -0.04418842 1.00000000 -0.5445305
## gamma_pi -0.6895890 0.20452445 -0.54453045 1.0000000
par(mfrow = c(1, 1))
sim_W = sim[, 1:2]
sim_gamma_mu = sim[, 3]
sim_gamma_pi = sim[, 4]
Let’s simulate counts from the simulated \(W\), \(\gamma_{\mu}\), and \(\gamma_{\pi}\). Zero fraction of the simulated counts is
mod = zinbModel(W=sim_W, gamma_mu = matrix(sim_gamma_mu, nrow = 1),
gamma_pi = matrix(sim_gamma_pi, nrow = 1),
alpha_mu=zinb@alpha_mu, alpha_pi=zinb@alpha_pi,
beta_mu=zinb@beta_mu, beta_pi=zinb@beta_pi, zeta = zinb@zeta)
sim = zinbSim(mod, seed = 8746)
core = t(sim$counts)
dim(core)
## [1] 1000 100
sum(core == 0)/(nrow(core) * ncol(core))
## [1] 0.46756
fq <- betweenLaneNormalization(core, which="full")
pca <- prcomp(t(log1p(fq)))
plot(pca$x, col=clust, pch=19, main="PCA of log-counts")
We want three different zero fractions (25%, 50%, 75%).
gammaOffset = c('25' = -3.2, '50' = .3, '75' = 2.7)
gammaPi = sapply(gammaOffset, function(x){
sim_gamma_pi + x
})
ggplot(melt(gammaPi), aes(x = factor(X2), y = value)) + theme_bw() +
geom_boxplot() + xlab('zero fraction') +
theme_bw() + ylab('gamma_pi')
models = lapply(1:3, function(i){
zinbModel(W=sim_W, gamma_mu = matrix(sim_gamma_mu, nrow = 1),
gamma_pi = matrix(gammaPi[,i], nrow = 1),
alpha_mu=zinb@alpha_mu, alpha_pi=zinb@alpha_pi,
beta_mu=zinb@beta_mu, beta_pi=zinb@beta_pi, zeta = zinb@zeta)
})
## check
for (i in 1:3){
mm = models[[i]]
cc = data.frame(mm@W[,1], mm@W[,2], mm@gamma_mu[1,], mm@gamma_pi[1,])
print(cor(cc, method="spearman"))
}
## mm.W...1. mm.W...2. mm.gamma_mu.1... mm.gamma_pi.1...
## mm.W...1. 1.0000000 -0.12055206 0.50000600 -0.6895890
## mm.W...2. -0.1205521 1.00000000 -0.04418842 0.2045245
## mm.gamma_mu.1... 0.5000060 -0.04418842 1.00000000 -0.5445305
## mm.gamma_pi.1... -0.6895890 0.20452445 -0.54453045 1.0000000
## mm.W...1. mm.W...2. mm.gamma_mu.1... mm.gamma_pi.1...
## mm.W...1. 1.0000000 -0.12055206 0.50000600 -0.6895890
## mm.W...2. -0.1205521 1.00000000 -0.04418842 0.2045245
## mm.gamma_mu.1... 0.5000060 -0.04418842 1.00000000 -0.5445305
## mm.gamma_pi.1... -0.6895890 0.20452445 -0.54453045 1.0000000
## mm.W...1. mm.W...2. mm.gamma_mu.1... mm.gamma_pi.1...
## mm.W...1. 1.0000000 -0.12055206 0.50000600 -0.6895890
## mm.W...2. -0.1205521 1.00000000 -0.04418842 0.2045245
## mm.gamma_mu.1... 0.5000060 -0.04418842 1.00000000 -0.5445305
## mm.gamma_pi.1... -0.6895890 0.20452445 -0.54453045 1.0000000
pal = colorRampPalette(c("black","black", "red","yellow"), space="rgb")
par(mfrow=c(1, 3))
for (k in 1:3){
smoothScatter(colMeans(log1p(getMu(models[[k]]))),
colMeans(getPi(models[[k]])), nbin = 256,
nrpoints = Inf, pch = "", cex = .7, xlim = c(0,8),
xlab = "Average simulated log Mu", main = names(gammaOffset)[k],
ylab = "Average simulated Pi",ylim = c(0,1),
colramp = pal)
}
B = 10
bio = clust
sim = lapply(1:3, function(k){
simModel = models[[k]]
simData = lapply(seq_len(B), function(i){
zinbSim(simModel, seed = i*k)
})
fileName = sprintf("./datasets/simZeisel%s.rda", names(gammaOffset[k]))
save(bio, simModel, simData, file = fileName)
sapply(simData, function(x) x$zeroFraction)
})
zeroFrac = data.frame(do.call(cbind, sim), stringsAsFactors = F)
colnames(zeroFrac) = names(gammaOffset)
ggplot(melt(zeroFrac), aes(x = variable, y = value)) + xlab('') + theme_bw() +
geom_boxplot() + xlab('zero fraction') +
theme_bw() + ylab('simulated zero fraction')
Let’s look at one simulated dataset for each zero fraction.
par(mfrow=c(1, 3))
for (k in 1:3){
load(sprintf("./datasets/simZeisel%s.rda", names(gammaOffset[k])))
counts = simData[[1]]$counts
zero_rate <- rowMeans(counts == 0)
plot(rowMeans(getPi(models[[k]])), zero_rate, main = names(gammaOffset)[k],
xlab="Average simulated Pi", ylab="Simulated Zero Rate",
pch=19, col=bio, ylim = c(0, 1), xlim = c(0,1))
abline(a=0,b=1,col='gray')
}
par(mfrow=c(1, 3))
for (k in 1:3){
load(sprintf("./datasets/simZeisel%s.rda", names(gammaOffset[k])))
counts = simData[[1]]$counts
coverage <- rowSums(counts)
plot(rowMeans(log1p(getMu(models[[k]]))), log1p(coverage), xlim = c(0, 3),
xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
col=bio,ylim = c(3,10), main = names(gammaOffset)[k])
}